분석 개요


1 분석환경 준비

1.1 기본 환경 설정

knitr::opts_chunk$set(echo=TRUE, error=FALSE)

1.2 필수 패키지 적재

library(readxl)       # 데이터 수집
library(dplyr)        # 데이터 가공
library(reshape2)     # 데이터 가공
library(psych)        # 통계
library(corrplot)     # 통계
library(ggplot2)      # 시각화
library(gridExtra)    # 시각화
library(RColorBrewer) # 시각화
library(plotly)       # 시각화

## 플롯 한글깨짐 방지 설정
par(family = "NanumGothic")
theme_set(theme_gray(base_family='NanumGothic')) 

2 데이터 수집

한국수원공사(K-Water)에 취득한 압력, 수질, 유량 등이 포함되 데이터와 기상청 공공데이터에서 취득한 강수량 자료를 수집 함

dataset <- read_excel("../data/와부 정수.xlsx")
rain <- read_excel("../data/서울강수량.xlsx")

dataset <- dataset %>% mutate(일자 = substr(발생일시, 1, 8))
rain$일자 <- as.character(rain$일자)
season_num <- c(4, 4, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4)

water_origin <- left_join(dataset, rain, by = "일자") %>%
  mutate(발생월 = as.integer(substr(발생일시, 5, 6))) %>%
  mutate(발생일 = as.integer(substr(발생일시, 7, 8))) %>%
  mutate(발생시간 = as.integer(substr(발생일시, 9, 10))) %>%
  mutate(계절 = season_num[as.integer(발생월)]) %>%
  select("발생월", "발생일", "발생시간", "압력1", 
         "수위1", "수위2", "유량1", "유량2", "강수량", 
         "계절", "PH", "탁도", "잔류염소") %>%
  filter(PH != 0 & 탁도 != 0 & 잔류염소 != 0 & 
           압력1 != 0 & 수위1 != 0 & 수위2 != 0 & 
           유량1 != 0 & 유량2 != 0) %>%
  arrange(발생월, 발생일, 발생시간)

water <- water_origin

3 데이터 정제

3.1 데이터의 구조 및 기초통계 확인

glimpse(water)
## Observations: 7,326
## Variables: 13
## $ 발생월   <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ 발생일   <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ 발생시간 <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17...
## $ 압력1    <dbl> 2.285, 2.278, 2.281, 2.480, 1.955, 2.235, 2.088, 2.486,...
## $ 수위1    <dbl> 2.79850, 2.76775, 2.83550, 2.95525, 2.80475, 2.43900, 2...
## $ 수위2    <dbl> 0.11875, 0.11900, 0.11925, 0.11925, 0.11975, 0.12150, 0...
## $ 유량1    <dbl> 3107.550, 3234.750, 3134.617, 2684.667, 2490.733, 2822....
## $ 유량2    <dbl> 7.3950, 8.4583, 8.0040, 9.1350, 9.0963, 9.7247, 8.8063,...
## $ 강수량   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ 계절     <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4...
## $ PH       <dbl> 7.4345, 7.4384, 7.4403, 7.4443, 7.4453, 7.4457, 7.445...
## $ 탁도     <dbl> 0.0362, 0.0361, 0.0363, 0.0365, 0.0365, 0.0366, 0.0364,...
## $ 잔류염소 <dbl> 0.7659, 0.7560, 0.7477, 0.7367, 0.7255, 0.7220, 0.7293, 0...
summary(water)
##      발생월           발생일         발생시간         압력1      
##  Min.   : 1.000   Min.   : 1.00   Min.   : 1.00   Min.   :1.659  
##  1st Qu.: 3.000   1st Qu.: 8.00   1st Qu.: 6.00   1st Qu.:2.184  
##  Median : 6.000   Median :16.00   Median :11.00   Median :2.309  
##  Mean   : 6.321   Mean   :15.85   Mean   :11.97   Mean   :2.334  
##  3rd Qu.:10.000   3rd Qu.:24.00   3rd Qu.:18.00   3rd Qu.:2.471  
##  Max.   :12.000   Max.   :31.00   Max.   :24.00   Max.   :3.348  
##      수위1           수위2             유량1          유량2        
##  Min.   :1.437   Min.   :0.05375   Min.   :1514   Min.   : 0.0097  
##  1st Qu.:2.752   1st Qu.:0.07700   1st Qu.:3224   1st Qu.: 0.1837  
##  Median :2.904   Median :0.09625   Median :3598   Median : 1.4403  
##  Mean   :2.876   Mean   :0.09487   Mean   :3569   Mean   : 2.9361  
##  3rd Qu.:3.036   3rd Qu.:0.10975   3rd Qu.:3938   3rd Qu.: 4.9083  
##  Max.   :3.644   Max.   :0.21700   Max.   :6308   Max.   :16.2013  
##      강수량             계절             PH             탁도        
##  Min.   :  0.000   Min.   :1.000   Min.   :6.767   Min.   :0.01990  
##  1st Qu.:  0.000   1st Qu.:1.000   1st Qu.:7.143   1st Qu.:0.03560  
##  Median :  0.000   Median :3.000   Median :7.238   Median :0.04250  
##  Mean   :  2.854   Mean   :2.562   Mean   :7.268   Mean   :0.04027  
##  3rd Qu.:  0.200   3rd Qu.:4.000   3rd Qu.:7.403   3rd Qu.:0.04330  
##  Max.   :108.500   Max.   :4.000   Max.   :7.774   Max.   :0.10980  
##     잔류염소     
##  Min.   :0.4845  
##  1st Qu.:0.7933  
##  Median :0.8185  
##  Mean   :0.8186  
##  3rd Qu.:0.8479  
##  Max.   :1.0658

3.2 변수별 이상치 및 결측치 제거

opar <- par(mfrow = c(2,3))

# 압력1 이상치 제거
stat <- boxplot(water$압력1, horizontal = T)$stats  # Boxplot 기초통계
stat
##       [,1]
## [1,] 1.760
## [2,] 2.184
## [3,] 2.309
## [4,] 2.471
## [5,] 2.894
out_below <- stat[1, 1]  # 아래쪽 극단치 경계
out_upper <- stat[5, 1]  # 위쪽 극단치 경계
out_below; out_upper
## [1] 1.76
## [1] 2.894
water$압력1 <- ifelse(water$압력1 < out_below |
                      water$압력1 > out_upper, NA, water$압력1 )

table(is.na(water$압력1))
## 
## FALSE  TRUE 
##  7230    96
# 수위1 이상치 제거
stat <- boxplot(water$수위1, horizontal = T)$stats  # Boxplot 기초통계
stat
##         [,1]
## [1,] 2.32700
## [2,] 2.75225
## [3,] 2.90400
## [4,] 3.03600
## [5,] 3.46050
out_below <- stat[1, 1]  # 아래쪽 극단치 경계
out_upper <- stat[5, 1]  # 위쪽 극단치 경계
out_below; out_upper
## [1] 2.327
## [1] 3.4605
water$수위1 <- ifelse(water$수위1 < out_below |
                      water$수위1 > out_upper, NA, water$수위1 )

table(is.na(water$수위1))
## 
## FALSE  TRUE 
##  7134   192
# 수위2 이상치 제거
stat <- boxplot(water$수위2, horizontal = T)$stats  # Boxplot 기초통계
stat
##         [,1]
## [1,] 0.05375
## [2,] 0.07700
## [3,] 0.09625
## [4,] 0.10975
## [5,] 0.15100
out_below <- stat[1, 1]  # 아래쪽 극단치 경계
out_upper <- stat[5, 1]  # 위쪽 극단치 경계
out_below; out_upper
## [1] 0.05375
## [1] 0.151
water$수위2 <- ifelse(water$수위2 < out_below |
                      water$수위2 > out_upper, NA, water$수위2 )

table(is.na(water$수위2))
## 
## FALSE  TRUE 
##  7325     1
# 유량1 이상치 제거
stat <- boxplot(water$유량1, horizontal = T)$stats  # Boxplot 기초통계
stat
##          [,1]
## [1,] 2153.400
## [2,] 3223.717
## [3,] 3597.708
## [4,] 3938.233
## [5,] 5009.233
out_below <- stat[1, 1]  # 아래쪽 극단치 경계
out_upper <- stat[5, 1]  # 위쪽 극단치 경계
out_below; out_upper
## [1] 2153.4
## [1] 5009.233
water$유량1 <- ifelse(water$유량1 < out_below |
                      water$유량1 > out_upper, NA, water$유량1 )

table(is.na(water$유량1))
## 
## FALSE  TRUE 
##  7261    65
# 유량2 이상치 제거
stat <- boxplot(water$유량2, horizontal = T)$stats  # Boxplot 기초통계
stat
##         [,1]
## [1,]  0.0097
## [2,]  0.1837
## [3,]  1.4403
## [4,]  4.9107
## [5,] 11.9963
out_below <- stat[1, 1]  # 아래쪽 극단치 경계
out_upper <- stat[5, 1]  # 위쪽 극단치 경계
out_below; out_upper
## [1] 0.0097
## [1] 11.9963
water$유량2 <- ifelse(water$유량2 < out_below |
                      water$유량2 > out_upper, NA, water$유량2 )

table(is.na(water$유량2))
## 
## FALSE  TRUE 
##  7215   111
par(opar)

opar <- par(mfrow = c(2,3))

boxplot(water$압력1, horizontal = T)
boxplot(water$수위1, horizontal = T)
boxplot(water$수위2, horizontal = T)
boxplot(water$유량1, horizontal = T)
boxplot(water$유량2, horizontal = T)

par(opar)

# 결측치(NA) 제거
water_new <- water %>% 
  filter(!is.na(압력1) & !is.na(수위1) & !is.na(수위2) & 
           !is.na(유량1) & !is.na(유량2))

glimpse(water_new)
## Observations: 6,883
## Variables: 13
## $ 발생월   <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ 발생일   <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ 발생시간 <int> 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1...
## $ 압력1    <dbl> 2.285, 2.278, 2.281, 2.480, 1.955, 2.235, 2.088, 2.406,...
## $ 수위1    <dbl> 2.79850, 2.76775, 2.83550, 2.95525, 2.80475, 2.43900, 2...
## $ 수위2    <dbl> 0.11875, 0.11900, 0.11925, 0.11925, 0.11975, 0.12150, 0...
## $ 유량1    <dbl> 3107.550, 3234.750, 3134.617, 2684.667, 2490.733, 2822....
## $ 유량2    <dbl> 7.3950, 8.4583, 8.0040, 9.1350, 9.0963, 9.7247, 8.8063,...
## $ 강수량   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ 계절     <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4...
## $ PH       <dbl> 7.4345, 7.4384, 7.4403, 7.4443, 7.4453, 7.4457, 7.445...
## $ 탁도     <dbl> 0.0362, 0.0361, 0.0363, 0.0365, 0.0365, 0.0366, 0.0364,...
## $ 잔류염소 <dbl> 0.7659, 0.7560, 0.7477, 0.7367, 0.7255, 0.7220, 0.7293, 0...

4 데이터 분석

4.1 상관관계 분석

WaterCorr <- cor(water_new)
corrplot(WaterCorr,method = "square")

4.2 단순회귀분석

  • 종속변수 : 탁도
  • 독립변수 : 압력1, 수위1, 수위2, 유량2
p1 <- water_new %>% 
  ggplot(aes(유량2, 탁도)) + geom_count(alpha = .5) + 
  geom_smooth(method = "lm")

ggplotly(p1)
p2 <- water_new %>% 
  ggplot(aes(수위1, 탁도)) + geom_count(alpha = .5) + 
  geom_smooth(method = "lm")

ggplotly(p2)
p3 <- water_new %>%
  ggplot(aes(수위2, 탁도)) + geom_count(alpha = .5) + 
  geom_smooth(method = "lm")

ggplotly(p3)
p4 <- water_new %>%
  ggplot(aes(압력1, 탁도)) + geom_count(alpha = .5) + 
  geom_smooth(method = "lm")

ggplotly(p4)
grid.arrange(p1, p2, p3, p4, ncol=2, nrow=2)

# 단순회귀분석 정확도 분석 (탁도기준)
s0 <- lm(formula = 탁도 ~ 유량2, data = water_new) # 24.7%
summary(s0)
## 
## Call:
## lm(formula = 탁도 ~ 유량2, data = water_new)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.021443 -0.001503  0.000637  0.002069  0.072483 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.244e-02  6.661e-05  637.14   <2e-16 ***
## 유량2       -7.684e-04  1.617e-05  -47.52   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.004133 on 6881 degrees of freedom
## Multiple R-squared:  0.2471, Adjusted R-squared:  0.247 
## F-statistic:  2259 on 1 and 6881 DF,  p-value: < 2.2e-16
s1 <- lm(formula = 탁도 ~ 수위1, data = water_new) # 0.24%
summary(s1)
## 
## Call:
## lm(formula = 탁도 ~ 수위1, data = water_new)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.020047 -0.004189  0.002106  0.002927  0.069526 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.0438732  0.0008381  52.351  < 2e-16 ***
## 수위1       -0.0012193  0.0002886  -4.225 2.42e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.004757 on 6881 degrees of freedom
## Multiple R-squared:  0.002588,   Adjusted R-squared:  0.002443 
## F-statistic: 17.85 on 1 and 6881 DF,  p-value: 2.418e-05
s2 <- lm(formula = 탁도 ~ 수위2, data = water_new) # 3.94%
summary(s2)
## 
## Call:
## lm(formula = 탁도 ~ 수위2, data = water_new)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.020355 -0.003407  0.001560  0.003289  0.071300 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.045127   0.000289  156.14   <2e-16 ***
## 수위2       -0.050875   0.003013  -16.88   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.004667 on 6881 degrees of freedom
## Multiple R-squared:  0.03978,    Adjusted R-squared:  0.03964 
## F-statistic: 285.1 on 1 and 6881 DF,  p-value: < 2.2e-16
s3 <- lm(formula = 탁도 ~ 압력1, data = water_new) # 11.5%
summary(s3)
## 
## Call:
## lm(formula = 탁도 ~ 압력1, data = water_new)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.017911 -0.002787  0.001307  0.002952  0.067915 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.0221533  0.0006095   36.35   <2e-16 ***
## 압력1       0.0078208  0.0002611   29.96   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.00448 on 6881 degrees of freedom
## Multiple R-squared:  0.1154, Adjusted R-squared:  0.1152 
## F-statistic: 897.4 on 1 and 6881 DF,  p-value: < 2.2e-16

4.3 다중회귀분석

### 다중회귀분석 (탁도기준)
x0 <- lm(탁도~유량1 + 유량2 + 수위1 + 수위2 + 압력1, data = water_new) #유의한 인자 찾기 1
anova(x0) # 유량1 유의하지 않은 인자로 확인되어 제외
## Analysis of Variance Table
## 
## Response: 탁도
##             Df   Sum Sq  Mean Sq   F value    Pr(>F)    
## 유량1        1 0.000004 0.000004    0.2801    0.5967    
## 유량2        1 0.041028 0.041028 2747.0099 < 2.2e-16 ***
## 수위1        1 0.000732 0.000732   49.0141 2.782e-12 ***
## 수위2        1 0.001997 0.001997  133.7210 < 2.2e-16 ***
## 압력1        1 0.009638 0.009638  645.3112 < 2.2e-16 ***
## Residuals 6877 0.102711 0.000015                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
x1 <- lm(탁도~유량2 + 수위1 + 수위2 + 압력1, data = water_new)
anova(x0, x1) #유량1 제외시 회귀분석 결과가 보다 정확해진다 함
## Analysis of Variance Table
## 
## Model 1: 탁도 ~ 유량1 + 유량2 + 수위1 + 수위2 + 압력1
## Model 2: 탁도 ~ 유량2 + 수위1 + 수위2 + 압력1
##   Res.Df     RSS Df   Sum of Sq      F Pr(>F)
## 1   6877 0.10271                             
## 2   6878 0.10271 -1 -6.9005e-07 0.0462 0.8298
anova(x1)
## Analysis of Variance Table
## 
## Response: 탁도
##             Df   Sum Sq  Mean Sq  F value    Pr(>F)    
## 유량2        1 0.038578 0.038578 2583.314 < 2.2e-16 ***
## 수위1        1 0.000452 0.000452   30.294 3.847e-08 ***
## 수위2        1 0.002814 0.002814  188.434 < 2.2e-16 ***
## 압력1        1 0.011555 0.011555  773.744 < 2.2e-16 ***
## Residuals 6878 0.102712 0.000015                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(x1) #회귀식이 34.1% 정도 정확도를 갖게 됨
## 
## Call:
## lm(formula = 탁도 ~ 유량2 + 수위1 + 수위2 + 압력1, 
##     data = water_new)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.018885 -0.001823  0.000430  0.002116  0.070464 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.019e-02  8.681e-04  34.777  < 2e-16 ***
## 유량2       -8.284e-04  1.975e-05 -41.952  < 2e-16 ***
## 수위1       -1.861e-03  2.393e-04  -7.776 8.56e-15 ***
## 수위2        2.996e-02  3.256e-03   9.201  < 2e-16 ***
## 압력1        6.447e-03  2.318e-04  27.816  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.003864 on 6878 degrees of freedom
## Multiple R-squared:  0.3421, Adjusted R-squared:  0.3417 
## F-statistic: 893.9 on 4 and 6878 DF,  p-value: < 2.2e-16
x2 <- lm(탁도~유량2 + 수위2 + 압력1, data = water_new) #수위1이 피벨류가 제일 높아 제외하고 돌려봄
anova(x1, x2) #수위1을 빼지 말라고 함 / 수위1가 유의한 인자라고 함
## Analysis of Variance Table
## 
## Model 1: 탁도 ~ 유량2 + 수위1 + 수위2 + 압력1
## Model 2: 탁도 ~ 유량2 + 수위2 + 압력1
##   Res.Df     RSS Df   Sum of Sq     F    Pr(>F)    
## 1   6878 0.10271                                   
## 2   6879 0.10361 -1 -0.00090303 60.47 8.563e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
opar <- par(mfrow=c(2,2))

plot(x1)

par(opar)

Have fun!